; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt %s -S -passes=msan 2>&1 | FileCheck %s

target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_packssdw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP4]], <8 x i32> [[TMP6]])
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP_VECTOR_PACK]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone


define <16 x i16> @test_x86_avx2_packssdw_fold() #0 {
; CHECK-LABEL: @test_x86_avx2_packssdw_fold(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
; CHECK-NEXT:    store <16 x i16> [[_MSPROP_VECTOR_PACK]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
  ret <16 x i16> %res
}


define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_packsswb(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i16>
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP4]], <16 x i16> [[TMP6]])
; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <32 x i8> [[_MSPROP_VECTOR_PACK]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <32 x i8> [[RES]]
;
  %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
  ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone


define <32 x i8> @test_x86_avx2_packsswb_fold() #0 {
; CHECK-LABEL: @test_x86_avx2_packsswb_fold(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer)
; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 -1, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 -1, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
; CHECK-NEXT:    store <32 x i8> [[_MSPROP_VECTOR_PACK]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <32 x i8> [[RES]]
;
  %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
  ret <32 x i8> %res
}


define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_packuswb(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i16>
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP4]], <16 x i16> [[TMP6]])
; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <32 x i8> [[_MSPROP_VECTOR_PACK]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <32 x i8> [[RES]]
;
  %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
  ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone


define <32 x i8> @test_x86_avx2_packuswb_fold() #0 {
; CHECK-LABEL: @test_x86_avx2_packuswb_fold(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer)
; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 -1, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 -1, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
; CHECK-NEXT:    store <32 x i8> [[_MSPROP_VECTOR_PACK]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <32 x i8> [[RES]]
;
  %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
  ret <32 x i8> %res
}


define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pavg_b(
; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
; CHECK-NEXT:    store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <32 x i8> [[RES]]
;
  %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
  ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone


define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pavg_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone


define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pmadd_wd(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone


define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_pmovmskb(
; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <32 x i8> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
; CHECK:       3:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
; CHECK-NEXT:    unreachable
; CHECK:       4:
; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> [[A0:%.*]])
; CHECK-NEXT:    store i32 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret i32 [[RES]]
;
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone


define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pmulh_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone


define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pmulhu_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone


define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psad_bw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <4 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i64> [[TMP4]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i64>
; CHECK-NEXT:    [[TMP7:%.*]] = lshr <4 x i64> [[TMP6]], <i64 48, i64 48, i64 48, i64 48>
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone


define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psll_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <8 x i32>
; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> [[A0:%.*]], <4 x i32> [[A1]])
; CHECK-NEXT:    store <8 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone


define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psll_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <4 x i64>
; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> [[A0:%.*]], <2 x i64> [[A1]])
; CHECK-NEXT:    store <4 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone


define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psll_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <16 x i16>
; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> [[TMP1]], <8 x i16> [[A1:%.*]])
; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> [[A0:%.*]], <8 x i16> [[A1]])
; CHECK-NEXT:    store <16 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone


define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_pslli_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> [[TMP1]], i32 7)
; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> [[A0:%.*]], i32 7)
; CHECK-NEXT:    store <8 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone


define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_pslli_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> [[TMP1]], i32 7)
; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i64> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> [[A0:%.*]], i32 7)
; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone


define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_pslli_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> [[TMP1]], i32 7)
; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> [[A0:%.*]], i32 7)
; CHECK-NEXT:    store <16 x i16> [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone


define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psra_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <8 x i32>
; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> [[A0:%.*]], <4 x i32> [[A1]])
; CHECK-NEXT:    store <8 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone


define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psra_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <16 x i16>
; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> [[TMP1]], <8 x i16> [[A1:%.*]])
; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> [[A0:%.*]], <8 x i16> [[A1]])
; CHECK-NEXT:    store <16 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone


define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_psrai_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> [[TMP1]], i32 7)
; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> [[A0:%.*]], i32 7)
; CHECK-NEXT:    store <8 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone


define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_psrai_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> [[TMP1]], i32 7)
; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> [[A0:%.*]], i32 7)
; CHECK-NEXT:    store <16 x i16> [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone


define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrl_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <8 x i32>
; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> [[A0:%.*]], <4 x i32> [[A1]])
; CHECK-NEXT:    store <8 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone


define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrl_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <4 x i64>
; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> [[A0:%.*]], <2 x i64> [[A1]])
; CHECK-NEXT:    store <4 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone


define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrl_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <16 x i16>
; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> [[TMP1]], <8 x i16> [[A1:%.*]])
; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> [[A0:%.*]], <8 x i16> [[A1]])
; CHECK-NEXT:    store <16 x i16> [[TMP9]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone


define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, ptr %p) #0 {
; CHECK-LABEL: @test_x86_avx2_psrl_w_load(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK:       3:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       4:
; CHECK-NEXT:    [[A1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128
; CHECK-NEXT:    [[TMP9:%.*]] = trunc i128 [[TMP8]] to i64
; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = sext i1 [[TMP10]] to i256
; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i256 [[TMP11]] to <16 x i16>
; CHECK-NEXT:    [[TMP13:%.*]] = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> [[TMP2]], <8 x i16> [[A1]])
; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i16> [[TMP13]], [[TMP12]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> [[A0:%.*]], <8 x i16> [[A1]])
; CHECK-NEXT:    store <16 x i16> [[TMP14]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %a1 = load <8 x i16>, ptr %p
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}


define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_psrli_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> [[TMP1]], i32 7)
; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> [[A0:%.*]], i32 7)
; CHECK-NEXT:    store <8 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone


define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_psrli_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> [[TMP1]], i32 7)
; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i64> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> [[A0:%.*]], i32 7)
; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone


define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) #0 {
; CHECK-LABEL: @test_x86_avx2_psrli_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> [[TMP1]], i32 7)
; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i16> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> [[A0:%.*]], i32 7)
; CHECK-NEXT:    store <16 x i16> [[TMP3]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone


define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_phadd_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone


define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_phadd_sw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone


define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_phadd_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone


define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_phsub_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone


define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_phsub_sw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone


define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_phsub_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone


define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pmadd_ub_sw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16>
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16>
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pmadd_ub_sw_load_op0(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK:       3:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       4:
; CHECK-NEXT:    [[A0:%.*]] = load <32 x i8>, ptr [[PTR:%.*]], align 32
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]]
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16>
; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer
; CHECK-NEXT:    [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16>
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %a0 = load <32 x i8>, ptr %ptr
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}

define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pmul_hr_sw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone


define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pshuf_b(
; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
; CHECK-NEXT:    store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <32 x i8> [[RES]]
;
  %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i8>> [#uses=1]
  ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone


define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psign_b(
; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
; CHECK-NEXT:    store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <32 x i8> [[RES]]
;
  %res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
  ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone


define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psign_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone


define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psign_w(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone


define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_mpsadbw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i8> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK:       5:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       6:
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]], i8 7)
; CHECK-NEXT:    store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(ptr %ptr, <32 x i8> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_mpsadbw_load_op0(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK:       3:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       4:
; CHECK-NEXT:    [[A0:%.*]] = load <32 x i8>, ptr [[PTR:%.*]], align 32
; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x i8> [[_MSLD]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP9]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
; CHECK:       10:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       11:
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]], i8 7)
; CHECK-NEXT:    store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %a0 = load <32 x i8>, ptr %ptr
  %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}

define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_packusdw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32>
; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP4]], <8 x i32> [[TMP6]])
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <16 x i16> [[_MSPROP_VECTOR_PACK]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone


define <16 x i16> @test_x86_avx2_packusdw_fold() #0 {
; CHECK-LABEL: @test_x86_avx2_packusdw_fold(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP_VECTOR_PACK:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
; CHECK-NEXT:    store <16 x i16> [[_MSPROP_VECTOR_PACK]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[RES]]
;
  %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
  ret <16 x i16> %res
}


define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) #0 {
; CHECK-LABEL: @test_x86_avx2_pblendvb(
; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP3]]
; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]], <32 x i8> [[A2:%.*]])
; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <32 x i8> [[RES]]
;
  %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1]
  ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone


define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pblendw(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]], <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
;
  %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone


define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pblendd_128(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 3>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 3>
; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
;
  %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone


define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_pblendd_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
;
  %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone


define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_permd(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly


define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_permps(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK:       5:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       6:
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x float> [[RES]]
;
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly


define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_maskload_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
; CHECK:       4:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       5:
; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]])
; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[RES]]
;
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly


define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_maskload_q_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
; CHECK:       4:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       5:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]])
; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonly


define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_maskload_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
; CHECK:       4:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       5:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES]]
;
  %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly


define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_maskload_d_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
; CHECK:       4:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       5:
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonly


define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 {
; CHECK-LABEL: @test_x86_avx2_maskstore_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK:       6:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       7:
; CHECK-NEXT:    call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]])
; CHECK-NEXT:    ret void
;
  call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind


define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 {
; CHECK-LABEL: @test_x86_avx2_maskstore_q_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK:       6:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       7:
; CHECK-NEXT:    call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]])
; CHECK-NEXT:    ret void
;
  call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind


define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 {
; CHECK-LABEL: @test_x86_avx2_maskstore_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK:       6:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       7:
; CHECK-NEXT:    call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]])
; CHECK-NEXT:    ret void
;
  call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind


define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 {
; CHECK-LABEL: @test_x86_avx2_maskstore_d_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK:       6:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       7:
; CHECK-NEXT:    call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]])
; CHECK-NEXT:    ret void
;
  call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(ptr, <8 x i32>, <8 x i32>) nounwind


define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psllv_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> [[A0:%.*]], <4 x i32> [[A1]])
; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES]]
;
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}

define <4 x i32> @test_x86_avx2_psllv_d_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psllv_d_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 33, i32 -1>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> <i32 2, i32 9, i32 0, i32 -1>, <4 x i32> <i32 1, i32 0, i32 33, i32 -1>)
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 1, i32 1, i32 -1>)
; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> <i32 1, i32 1, i32 1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 -1>)
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[RES2:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES2]]
;
  %res0 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> <i32 2, i32 9, i32 0, i32 -1>, <4 x i32> <i32 1, i32 0, i32 33, i32 -1>)
  %res1 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> <i32 1, i32 1, i32 1, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1,  i32 -1>)
  %res2 = add <4 x i32> %res0, %res1
  ret <4 x i32> %res2
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone


define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psllv_d_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> [[TMP1]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> [[A0:%.*]], <8 x i32> [[A1]])
; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}

define <8 x i32> @test_x86_avx2_psllv_d_256_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psllv_d_256_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0>, <8 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2>)
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[RES2:%.*]] = add <8 x i32> [[RES0]], [[RES1]]
; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES2]]
;
  %res0 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0>, <8 x i32> <i32 1, i32 0, i32 33, i32 -1,i32 2, i32 0, i32 34, i32 -2>)
  %res1 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1,  i32 1, i32 1, i32 1, i32 1, i32 -1>)
  %res2 = add <8 x i32> %res0, %res1
  ret <8 x i32> %res2
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone


define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psllv_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> [[A0:%.*]], <2 x i64> [[A1]])
; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[RES]]
;
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
define <2 x i64> @test_x86_avx2_psllv_q_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psllv_q_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> zeroinitializer, <2 x i64> <i64 1, i64 -1>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> <i64 4, i64 -1>, <2 x i64> <i64 1, i64 -1>)
; CHECK-NEXT:    store <2 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[RES]]
;
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> <i64 4, i64 -1>, <2 x i64> <i64 1, i64 -1>)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone


define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psllv_q_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> [[A0:%.*]], <4 x i64> [[A1]])
; CHECK-NEXT:    store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}

define <4 x i64> @test_x86_avx2_psllv_q_256_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psllv_q_256_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> zeroinitializer, <4 x i64> <i64 1, i64 1, i64 1, i64 -1>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> <i64 4, i64 4, i64 4, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 -1>)
; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> <i64 4, i64 4, i64 4, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 -1>)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone


define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrlv_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> [[A0:%.*]], <4 x i32> [[A1]])
; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES]]
;
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}

define <4 x i32> @test_x86_avx2_psrlv_d_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psrlv_d_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 0, i32 33, i32 -1>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> <i32 2, i32 9, i32 0, i32 -1>, <4 x i32> <i32 1, i32 0, i32 33, i32 -1>)
; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 1, i32 1, i32 -1>)
; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> <i32 4, i32 4, i32 4, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1, i32 -1>)
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[RES2:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES2]]
;
  %res0 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> <i32 2, i32 9, i32 0, i32 -1>, <4 x i32> <i32 1, i32 0, i32 33, i32 -1>)
  %res1 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> <i32 4, i32 4, i32 4, i32 -1>, <4 x i32> <i32 1, i32 1, i32 1,  i32 -1>)
  %res2 = add <4 x i32> %res0, %res1
  ret <4 x i32> %res2
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone


define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrlv_d_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> [[TMP1]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> [[A0:%.*]], <8 x i32> [[A1]])
; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}

define <8 x i32> @test_x86_avx2_psrlv_d_256_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psrlv_d_256_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0>, <8 x i32> <i32 1, i32 0, i32 33, i32 -1, i32 2, i32 0, i32 34, i32 -2>)
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 -1>)
; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[RES2:%.*]] = add <8 x i32> [[RES0]], [[RES1]]
; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES2]]
;
  %res0 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0>, <8 x i32> <i32 1, i32 0, i32 33, i32 -1,i32 2, i32 0, i32 34, i32 -2>)
  %res1 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <8 x i32> <i32 1, i32 1, i32 1,  i32 1, i32 1, i32 1, i32 1, i32 -1>)
  %res2 = add <8 x i32> %res0, %res1
  ret <8 x i32> %res2
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone


define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrlv_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> [[A0:%.*]], <2 x i64> [[A1]])
; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[RES]]
;
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}

define <2 x i64> @test_x86_avx2_psrlv_q_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psrlv_q_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> zeroinitializer, <2 x i64> <i64 1, i64 -1>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> <i64 4, i64 4>, <2 x i64> <i64 1, i64 -1>)
; CHECK-NEXT:    store <2 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[RES]]
;
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> <i64 4, i64 4>, <2 x i64> <i64 1, i64 -1>)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone


define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrlv_q_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> [[A0:%.*]], <4 x i64> [[A1]])
; CHECK-NEXT:    store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}


define <4 x i64> @test_x86_avx2_psrlv_q_256_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psrlv_q_256_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> zeroinitializer, <4 x i64> <i64 1, i64 1, i64 1, i64 -1>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> <i64 4, i64 4, i64 4, i64 4>, <4 x i64> <i64 1, i64 1, i64 1, i64 -1>)
; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> <i64 4, i64 4, i64 4, i64 4>, <4 x i64> <i64 1, i64 1, i64 1, i64 -1>)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone


define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrav_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> [[TMP1]], <4 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> [[A0:%.*]], <4 x i32> [[A1]])
; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES]]
;
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}

define <4 x i32> @test_x86_avx2_psrav_d_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psrav_d_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
; CHECK-NEXT:    store <4 x i32> [[TMP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES]]
;
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
; CHECK-LABEL: @test_x86_avx2_psrav_d_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT:    [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> [[TMP1]], <8 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> [[A0:%.*]], <8 x i32> [[A1]])
; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}

define <8 x i32> @test_x86_avx2_psrav_d_256_const() #0 {
; CHECK-LABEL: @test_x86_avx2_psrav_d_256_const(
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
; CHECK-NEXT:    store <8 x i32> [[TMP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, ptr %a1, <4 x i32> %idx, <2 x double> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_d_pd(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> [[A0:%.*]], ptr [[A1:%.*]], <4 x i32> [[IDX:%.*]], <2 x double> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x double> [[RES]]
;
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
  ptr %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, ptr,
  <4 x i32>, <2 x double>, i8) nounwind readonly

define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, ptr %a1, <4 x i32> %idx, <4 x double> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_d_pd_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> [[A0:%.*]], ptr [[A1:%.*]], <4 x i32> [[IDX:%.*]], <4 x double> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x double> [[RES]]
;
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
  ptr %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ;
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, ptr,
  <4 x i32>, <4 x double>, i8) nounwind readonly

define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, ptr %a1, <2 x i64> %idx, <2 x double> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_q_pd(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> [[A0:%.*]], ptr [[A1:%.*]], <2 x i64> [[IDX:%.*]], <2 x double> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x double> [[RES]]
;
  %res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
  ptr %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, ptr,
  <2 x i64>, <2 x double>, i8) nounwind readonly

define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, ptr %a1, <4 x i64> %idx, <4 x double> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_q_pd_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> [[A0:%.*]], ptr [[A1:%.*]], <4 x i64> [[IDX:%.*]], <4 x double> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x double> [[RES]]
;
  %res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
  ptr %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, ptr,
  <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, ptr %a1, <4 x i32> %idx, <4 x float> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_d_ps(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> [[A0:%.*]], ptr [[A1:%.*]], <4 x i32> [[IDX:%.*]], <4 x float> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
  ptr %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, ptr,
  <4 x i32>, <4 x float>, i8) nounwind readonly

define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, ptr %a1, <8 x i32> %idx, <8 x float> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_d_ps_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP4]] to i256
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> [[A0:%.*]], ptr [[A1:%.*]], <8 x i32> [[IDX:%.*]], <8 x float> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x float> [[RES]]
;
  %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
  ptr %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr,
  <8 x i32>, <8 x float>, i8) nounwind readonly

define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, ptr %a1, <2 x i64> %idx, <4 x float> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_q_ps(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> [[A0:%.*]], ptr [[A1:%.*]], <2 x i64> [[IDX:%.*]], <4 x float> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
  ptr %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, ptr,
  <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, ptr %a1, <4 x i64> %idx, <4 x float> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_q_ps_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> [[A0:%.*]], ptr [[A1:%.*]], <4 x i64> [[IDX:%.*]], <4 x float> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x float> [[RES]]
;
  %res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
  ptr %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, ptr,
  <4 x i64>, <4 x float>, i8) nounwind readonly

define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, ptr %a1, <4 x i32> %idx, <2 x i64> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_d_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> [[A0:%.*]], ptr [[A1:%.*]], <4 x i32> [[IDX:%.*]], <2 x i64> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[RES]]
;
  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0,
  ptr %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ;
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, ptr,
  <4 x i32>, <2 x i64>, i8) nounwind readonly

define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, ptr %a1, <4 x i32> %idx, <4 x i64> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_d_q_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> [[A0:%.*]], ptr [[A1:%.*]], <4 x i32> [[IDX:%.*]], <4 x i64> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0,
  ptr %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ;
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr,
  <4 x i32>, <4 x i64>, i8) nounwind readonly

define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, ptr %a1, <2 x i64> %idx, <2 x i64> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_q_q(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> [[A0:%.*]], ptr [[A1:%.*]], <2 x i64> [[IDX:%.*]], <2 x i64> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[RES]]
;
  %res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
  ptr %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ;
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, ptr,
  <2 x i64>, <2 x i64>, i8) nounwind readonly

define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, ptr %a1, <4 x i64> %idx, <4 x i64> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_q_q_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> [[A0:%.*]], ptr [[A1:%.*]], <4 x i64> [[IDX:%.*]], <4 x i64> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i64> [[RES]]
;
  %res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0,
  ptr %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ;
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr,
  <4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, ptr %a1, <4 x i32> %idx, <4 x i32> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_d_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], <4 x i32> [[IDX:%.*]], <4 x i32> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES]]
;
  %res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0,
  ptr %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ;
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, ptr,
  <4 x i32>, <4 x i32>, i8) nounwind readonly

define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, ptr %a1, <8 x i32> %idx, <8 x i32> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_d_d_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP4]] to i256
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> [[A0:%.*]], ptr [[A1:%.*]], <8 x i32> [[IDX:%.*]], <8 x i32> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x i32> [[RES]]
;
  %res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0,
  ptr %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ;
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr,
  <8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, ptr %a1, <2 x i64> %idx, <4 x i32> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_q_d(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], <2 x i64> [[IDX:%.*]], <4 x i32> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES]]
;
  %res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0,
  ptr %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ;
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, ptr,
  <2 x i64>, <4 x i32>, i8) nounwind readonly

define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, ptr %a1, <4 x i64> %idx, <4 x i32> %mask) #0 {
; CHECK-LABEL: @test_x86_avx2_gather_q_d_256(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], <4 x i64> [[IDX:%.*]], <4 x i32> [[MASK:%.*]], i8 2)
; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <4 x i32> [[RES]]
;
  %res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
  ptr %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr,
  <4 x i64>, <4 x i32>, i8) nounwind readonly

define <8 x float>  @test_gather_mask(<8 x float> %a0, ptr %a, <8 x i32> %idx, <8 x float> %mask, ptr nocapture %out) #0 {
; CHECK-LABEL: @test_gather_mask(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP4]] to i256
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK:       9:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       10:
; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> [[A0:%.*]], ptr [[A:%.*]], <8 x i32> [[IDX:%.*]], <8 x float> [[MASK:%.*]], i8 4)
; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i64 [[TMP5]], 0
; CHECK-NEXT:    br i1 [[_MSCMP6]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
; CHECK:       11:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       12:
; CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[OUT:%.*]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080
; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr [[TMP15]], align 4
; CHECK-NEXT:    store <8 x float> [[MASK]], ptr [[OUT]], align 4
; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <8 x float> [[RES]]
;
  %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
  ptr %a, <8 x i32> %idx, <8 x float> %mask, i8 4) ;

  store <8 x float> %mask, ptr %out, align 4

  ret <8 x float> %res
}

define <2 x i64> @test_mask_demanded_bits(<2 x i64> %a0, ptr %a1, <2 x i64> %idx, <2 x i1> %mask) #0 {
; CHECK-LABEL: @test_mask_demanded_bits(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT:    call void @llvm.donothing()
; CHECK-NEXT:    [[_MSPROP:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
; CHECK-NEXT:    [[MASK1:%.*]] = sext <2 x i1> [[MASK:%.*]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i64 [[TMP3]], 0
; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[_MSPROP]] to i128
; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP7]], 0
; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK:       8:
; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT:    unreachable
; CHECK:       9:
; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> [[A0:%.*]], ptr [[A1:%.*]], <2 x i64> [[IDX:%.*]], <2 x i64> [[MASK1]], i8 2)
; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT:    ret <2 x i64> [[RES]]
;
  %mask1 = sext <2 x i1> %mask to <2 x i64>
  %res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
  ptr %a1, <2 x i64> %idx, <2 x i64> %mask1, i8 2) ;
  ret <2 x i64> %res
}

attributes #0 = { sanitize_memory }
